#Importing neccessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_profiling as pp
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from itertools import product
import seaborn as sns
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import TruncatedSVD
from yellowbrick.cluster import SilhouetteVisualizer
import warnings
warnings.filterwarnings("ignore")
import plotly.offline
plotly.offline.init_notebook_mode()
#Reading files
#Appears to be the Olist Brazilian e-commerce dataset (customers, orders,
#order items, products, and the Portuguese->English category translation) — confirm.
customers=pd.read_csv('customers_dataset.csv')
orders=pd.read_csv('orders_dataset.csv')
order_items=pd.read_csv('order_items_dataset.csv')
products=pd.read_csv('products_dataset.csv')
transl=pd.read_csv('product_category_name_translation.csv')
#Changing necessary datatypes
#FIX: astype('datetime64') without a unit is deprecated and raises in pandas 2.x;
#use the explicit 'datetime64[ns]' dtype instead (identical result on pandas 1.x).
datetime_cols = ['order_purchase_timestamp', 'order_approved_at',
                 'order_delivered_carrier_date', 'order_delivered_customer_date',
                 'order_estimated_delivery_date']
orders = orders.astype({col: 'datetime64[ns]' for col in datetime_cols})
order_items = order_items.astype({'shipping_limit_date': 'datetime64[ns]'})
#Fast dataframe profiling of customers
#pandas_profiling builds an automated EDA report and writes it to HTML
customers_prof = pp.ProfileReport(customers)
customers_prof.to_file("customers_prof.html")
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
#Fast dataframe profiling of orders
#Automated EDA report for the orders table, written to HTML
orders_prof = pp.ProfileReport(orders)
orders_prof.to_file("orders_prof.html")
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
#Sanity check before merging: the customer_id sets of the two tables match
#exactly (the notebook output below shows True)
set(orders['customer_id'])==set(customers['customer_id'])
True
#Left-join customer attributes onto orders; lossless because the id sets match
orders_customers=orders.merge(customers, on='customer_id', how='left')
#Fast dataframe profiling of orders_customers dataframe
orders_customers_prof = pp.ProfileReport(orders_customers)
orders_customers_prof.to_file("orders_customers.html")
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
#Fast dataframe profiling of order_items dataframe
#Automated EDA report for the order items table, written to HTML
order_items_prof = pp.ProfileReport(order_items)
order_items_prof.to_file("order_items_prof.html")
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
#Status breakdown of orders that have NO matching rows in order_items
#(the output below shows they are mostly 'unavailable'/'canceled')
orders_customers[~orders_customers['order_id'].isin(list(order_items['order_id']))].value_counts('order_status')
order_status unavailable 603 canceled 164 created 5 invoiced 2 shipped 1 dtype: int64
#Status breakdown of orders that DO have matching rows in order_items
orders_customers[orders_customers['order_id'].isin(list(order_items['order_id']))].value_counts('order_status')
order_status delivered 96478 shipped 1106 canceled 461 invoiced 312 processing 301 unavailable 6 approved 2 dtype: int64
#Fraction of missing delivery dates among orders without order_items rows
#(.isna().mean() == .isna().sum()/len; output below shows it is 1.0)
orders_customers.loc[~orders_customers['order_id'].isin(list(order_items['order_id'])), "order_delivered_customer_date"].isna().mean()
1.0
#Fraction of missing delivery dates among orders that do have order_items rows
#(.isna().mean() == .isna().sum()/len; output below shows ~2.2%)
orders_customers.loc[orders_customers['order_id'].isin(list(order_items['order_id'])), "order_delivered_customer_date"].isna().mean()
0.022196095919567026
#Outer join keeps orders without items as well as items without orders
orders_customers_items=orders_customers.merge(order_items, on='order_id', how='outer')
#Fast dataframe profiling of orders_customers_items dataframe
orders_customers_items_prof = pp.ProfileReport(orders_customers_items)
orders_customers_items_prof.to_file("orders_customers_items.html")
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
#Fast dataframe profiling of products dataframe
#Automated EDA report for the products table, written to HTML
products_prof = pp.ProfileReport(products)
products_prof.to_file("products_prof.html")
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
#Attach product attributes, then the English category names
orders_customers_items_prod=orders_customers_items.merge(products, on='product_id', how='left')
orders_all=orders_customers_items_prod.merge(transl, on='product_category_name', how='left')
#Inspect the resulting columns (listed in the output below)
orders_all.columns
Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
'order_approved_at', 'order_delivered_carrier_date',
'order_delivered_customer_date', 'order_estimated_delivery_date',
'customer_unique_id', 'customer_zip_code_prefix', 'customer_city',
'customer_state', 'order_item_id', 'product_id', 'seller_id',
'shipping_limit_date', 'price', 'freight_value',
'product_category_name', 'product_name_lenght',
'product_description_lenght', 'product_photos_qty', 'product_weight_g',
'product_length_cm', 'product_height_cm', 'product_width_cm',
'product_category_name_english'],
dtype='object')
#Drop the per-order customer_id (customer_unique_id identifies the person)
#and the Portuguese category name (English translation was merged in)
orders_all=orders_all.drop(columns=['customer_id', 'product_category_name'])
#Would like to have order day and month for future purposes.
#Vectorized .dt accessors replace the row-wise .apply(lambda ...) calls:
#.dt.normalize() truncates to midnight and already yields datetime64[ns],
#the same result as the original date()-then-astype round trip, much faster.
orders_all['order_day']=orders_all['order_purchase_timestamp'].dt.normalize()
orders_all['order_month']=orders_all['order_purchase_timestamp'].dt.to_period('M').dt.to_timestamp()
#Zip code prefix is an identifier, not a quantity
orders_all=orders_all.astype({'customer_zip_code_prefix':'str'})
#Counting the number of customers in each state then city and correcting the city names
df=orders_all.groupby(['customer_state', 'customer_city'])['customer_unique_id'].nunique().reset_index()
#Title-case the raw (lowercase) city names for display
df['city']=df['customer_city'].apply(lambda x: x.title())
#Plotting treemap for states and cities (number of customers)
fig = px.treemap(df, path=['customer_state', 'city'], values='customer_unique_id')
fig.data[0].textinfo = 'label+text+value'
fig.layout.hovermode = False
fig.show()
#Same thing for product_categories, individual products will make it useless
#(treemap of distinct customers per English category name)
fig = px.treemap(orders_all.groupby(['product_category_name_english'])['customer_unique_id'].nunique().reset_index(), path=['product_category_name_english'], values='customer_unique_id')
fig.data[0].textinfo = 'label+text+value'
fig.layout.hovermode = False
fig.show()
#Monthly revenue from orders which were not cancelled
#(sum of item prices per order_month, excluding canceled/unavailable orders)
monthly=pd.DataFrame(orders_all[~orders_all['order_status'].isin(['canceled', 'unavailable'])].groupby('order_month')['price'].sum()).reset_index()
#September is incomplete, thus the limit
monthly=monthly[monthly['order_month']<=pd.Timestamp('2018-08-01')]
#Monthly sales revenue plot
fig = px.line(monthly, x='order_month', y='price')
fig.update_layout(
title={
'text': "Monthly sales revenue",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
xaxis_title="Month",
yaxis_title="Sales Revenue",
)
fig.show()
#Calculating the ratio of prices in the current month with the one for Jan 2018
#Months of interest: Jan-Aug 2018 (September's data is incomplete).
#pd.date_range replaces the hard-coded 8-element Timestamp list that the
#original duplicated in two places.
months_2018=pd.date_range('2018-01-01', '2018-08-01', freq='MS')
#Selecting the columns we need
top=orders_all[['order_month', 'product_id', 'price']]
#Select only the orders which happened in Jan-Aug 2018
top=top[top['order_month'].isin(months_2018)]
#Mean price of each product every month
top=top.groupby(['order_month', 'product_id'], as_index=False)['price'].mean()
top=top.sort_values(['product_id', 'order_month']).reset_index(drop=True)
#Make another dataframe with the full grid of months x encountered product_ids
indexing=pd.DataFrame(list(product(months_2018, list(top['product_id'].unique()))), columns=['order_month', 'product_id'])
#Merge with what we have already
indexing=indexing.merge(top, on=['order_month', 'product_id'], how="left")
#Keep the products for which all months data exists since Jan 2018
indexing=indexing.groupby('product_id').filter(lambda g: g.isnull().sum().sum() ==0).sort_values(['product_id', 'order_month']).reset_index(drop=True)
indexing = indexing.set_index(['product_id','order_month'])
#Ratio of each month's mean price to the product's first (Jan 2018) price
firsts = (indexing.groupby(level=['product_id']).transform('first'))
result = indexing / firsts
result=result.reset_index()
#Average the per-product ratios within each month
result=result.groupby('order_month')['price'].mean()
#Plotting the ratios (monthly mean of the per-product price ratios vs Jan 2018)
fig = px.line(result)
fig.update_layout(
title={
'text': "Monthly average prices compared to Jan 2018",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
xaxis_title="Month",
yaxis_title="Prices ratio",
)
fig.update_layout(showlegend=False)
fig.show()
#Report how many products survived the "data in every month" filter
print('The number of products this calculation was based on is '+str(indexing.reset_index().product_id.nunique()))
The number of products this calculation was based on is 96
#Would like to have order day and month for future purposes.
#Vectorized .dt accessors replace the row-wise .apply(lambda ...) calls;
#.dt.normalize() already yields datetime64[ns] midnights, so the extra
#astype step of the original is unnecessary.
orders['order_day']=orders['order_purchase_timestamp'].dt.normalize()
orders['order_month']=orders['order_purchase_timestamp'].dt.to_period('M').dt.to_timestamp()
#This part is to plot estimated delivery duration that customers usually see and decide if they want to wait that long
#Promise window in weeks: (estimated delivery - purchase time) / 1 week
orders['min_diff_estimated'] = (orders.order_estimated_delivery_date - orders.order_purchase_timestamp) / pd.Timedelta(weeks=1)
#Daily average of that window (groupby already returns a DataFrame after reset_index)
min_diff_estimated=orders.groupby('order_day')['min_diff_estimated'].mean().reset_index()
#Plot the daily mean estimated delivery window, from early Jan 2017 onwards
fig = px.line(min_diff_estimated[min_diff_estimated['order_day']>=pd.Timestamp('2017-01-05')], x="order_day", y="min_diff_estimated")
fig.update_layout(
title={
'text': "Estimated average delivery time in weeks",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
xaxis_title="Month",
yaxis_title="Delivery time in weeks",
)
fig.show()
#Plot the number of orders, September is incomplete, thus the limit
num_of_orders=pd.DataFrame(orders.groupby(['order_month'])['order_id'].count().reset_index())
num_of_orders=num_of_orders[num_of_orders['order_month']<=pd.Timestamp('2018-08-01')]
fig = px.line(num_of_orders, x="order_month", y="order_id")
fig.update_layout(
title={
'text': "Monthly number of orders",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
xaxis_title="Month",
yaxis_title="Number of orders",
)
fig.show()
#Counting the number of new customers on a monthly level
new_customers=orders_all.groupby(['order_month'])['customer_unique_id'].apply(list).reset_index(name='new')
new_customers=new_customers.sort_values(['order_month']).reset_index(drop=True)
#A customer is "new" in a month only if they appeared in NO earlier month.
#BUGFIX: the original built union[i] from new[i] | new[i-1] (the previous
#month's customers only) instead of the running union of all earlier months,
#so customers returning after a gap of 2+ months were wrongly counted as new.
#The rewrite also avoids the chained-assignment writes (df['col'][i]=...).
seen = set(new_customers['new'][0])
unions = [seen]
num_new = [None]  # first month has no baseline, matching the original plot
for members in new_customers['new'][1:]:
    current = set(members)
    num_new.append(len(current - seen))
    seen = seen | current  # new set each step so earlier entries stay frozen
    unions.append(seen)
new_customers['num_new'] = num_new
new_customers['union'] = unions
#September is incomplete thus the filter
new_customers=new_customers[new_customers['order_month']<=pd.Timestamp('2018-08-01')]
fig = px.line(new_customers, x="order_month", y='num_new')
fig.update_layout(
title={
'text': "Monthly number of new customers",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
xaxis_title="Month",
yaxis_title="Number of new customers",
)
fig.show()
#Counting the number of customers with different number of orders
#(distinct orders per customer, then a histogram of those counts)
number_of_orders=pd.DataFrame(orders_all.groupby('customer_unique_id')['order_id'].nunique()).reset_index()
number_of_orders=pd.DataFrame(number_of_orders.groupby('order_id').size()).reset_index()
fig = px.bar(number_of_orders, x="order_id", y=0)
fig.update_layout(
title={
'text': "Number of customers with different number of orders",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
xaxis_title="Number of orders",
yaxis_title="Number of customers",
)
fig.show()
#Counting the number of orders with different number of items
#(rows per order_id in orders_all = item lines per order)
number_of_items=pd.DataFrame(orders_all.groupby('order_id').size()).reset_index()
number_of_items=number_of_items.rename(columns={0:'count'})
number_of_items=pd.DataFrame(number_of_items.groupby('count').size()).reset_index()
fig = px.bar(number_of_items, x="count", y=0)
fig.update_layout(
title={
'text': "Number of orders with different number of items",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
xaxis_title="Number of items",
yaxis_title="Number of orders",
)
fig.show()
#Monthly number of items ordered per product category
#(original comment here was copy-pasted from the previous cell)
products=orders_all.groupby(['order_month', 'product_category_name_english']).size().reset_index()
products=products.rename(columns={'product_category_name_english':'Category'})
#To keep the time interval consistent and the uselessness of the data before we filter from Jan 2017
products=products[products['order_month']>=pd.Timestamp('2017-01-01')]
#Top 5 categories by total revenue
top5_categories=list(orders_all.groupby('product_category_name_english')['price'].sum().nlargest(n=5).index)
#BUGFIX: category_orders must be keyed by the "Category" column; the original
#keyed it by "order_month" with category values, which plotly silently ignored.
fig = px.bar(products[products['Category'].isin(top5_categories)], x='order_month', y=0, color="Category",
             category_orders={"Category": ["watches_gifts", "sports_leisure", "health_beauty", "computers_accessories", "bed_bath_table"]})
fig.update_layout(
title={
'text': "Number of items ordered for top 5 highest revenue product categories",
'y':0.95,
'x':0.4,
'xanchor': 'center',
'yanchor': 'top'},
xaxis_title="Month",
yaxis_title="Number of items ordered",
)
fig.show()
#Select top 10 most sold product groups as the features
needed_list=list(orders_all.groupby('product_category_name_english').size().nlargest(n=10).index)
#To join some other measure into the dataframe we need to keep the customer_unique_id
needed_list.append('customer_unique_id')
#Average spending per customer everytime they make an order
by_order=pd.DataFrame(orders_all.groupby(['customer_unique_id', 'order_id'])['price'].sum()).reset_index()
average_spending=pd.DataFrame(by_order[['customer_unique_id', 'price']].groupby("customer_unique_id")['price'].mean()).reset_index()
#Total spending per customer
total_spending=pd.DataFrame(orders_all.groupby('customer_unique_id')['price'].sum()).reset_index()
total_spending=total_spending.rename(columns={'price':'total_spendings'})
#Total number of orders per customer
num_of_orders=pd.DataFrame(orders_all.groupby(['customer_unique_id'])['order_id'].nunique()).reset_index()
#Average number of items purchased per order
num_of_items=pd.DataFrame(orders_all.groupby(['customer_unique_id', 'order_id']).size()).reset_index()
num_of_items=num_of_items.rename(columns={0:'num_items'})
avg_num_of_items=pd.DataFrame(num_of_items[['customer_unique_id', 'num_items']].groupby("customer_unique_id")['num_items'].mean()).reset_index()
#Total number of items ordered
num_of_items_sum=pd.DataFrame(orders_all.groupby(['customer_unique_id', 'order_id']).size()).reset_index()
num_of_items_sum=num_of_items_sum.rename(columns={0:'num_items_total'})
total_num_of_items=pd.DataFrame(num_of_items_sum[['customer_unique_id', 'num_items_total']].groupby("customer_unique_id")['num_items_total'].sum()).reset_index()
#Calculating the number of times each product type was bought by each customer
category_counts=pd.DataFrame(orders_all.groupby(['customer_unique_id', 'product_category_name_english']).size()).reset_index()
category_counts=category_counts.rename(columns={0:'count'})
category_counts=pd.pivot_table(category_counts, values='count', index='customer_unique_id', columns=['product_category_name_english']).reset_index()
category_counts=category_counts.rename_axis(None, axis=1)
#A customer who never bought a category gets 0 rather than NaN
category_counts=category_counts.fillna(0)
#Then keep only the 10 category features (plus customer_unique_id) that we selected
category_counts=category_counts[needed_list]
#Joining the tables
category_counts=category_counts.merge(num_of_orders, on='customer_unique_id', how='left')
category_counts=category_counts.merge(total_spending, on='customer_unique_id', how='left')
category_counts=category_counts.merge(total_num_of_items, on='customer_unique_id', how='left')
#NOTE(review): average_spending and avg_num_of_items are computed above but never
#merged in, so the 'num_items' and 'price' entries of this rename look like no-ops
#(only 'order_id'->'num_orders' matches an existing column) — verify intent.
category_counts=category_counts.rename(columns={'num_items':'avg_num_items_bought', 'order_id':'num_orders', 'price':'average_spending'})
#For SVD dimension reduction it would be nicer to have values between 0 and 1 (good at handling sparse data)
scaler=MinMaxScaler()
#Scale the data and get the scaled version of it (customer id column excluded)
new_input = scaler.fit_transform(category_counts.drop(columns=['customer_unique_id']))
#7 components cover 81% of the variance (per the printed output below)
svd = TruncatedSVD(n_components=7, n_iter=10, random_state=100)
svd.fit(new_input)
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
[0.16880041 0.16941056 0.14626038 0.11332308 0.0880388 0.06928287 0.05571084] 0.8108269529470031
#Project the scaled feature matrix onto the 7 SVD components that explain
#81% of the variance (the feature matrix has 13 columns, not 14)
data=svd.transform(new_input)
#Deciding on the number of clusters for the KMeans algorithm
#KElbowVisualizer fits KMeans for k up to 10 and plots the distortion elbow
Elbow_M = KElbowVisualizer(KMeans(random_state=100), k=10)
Elbow_M.fit(data)
Elbow_M.show()
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
#Silhouette score plot will give better idea on which k to choose; search k in [3, 4, 5, 6]
fig, ax = plt.subplots(2, 2, figsize=(15,8))
for k in [3, 4, 5, 6]:
    #One KMeans instance per candidate number of clusters
    model = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=100, random_state=42)
    #Map k=3..6 onto the 2x2 grid of subplot axes
    row, col = divmod(k - 3, 2)
    #Create the SilhouetteVisualizer for this model and fit it on its axis
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick', ax=ax[row][col])
    visualizer.fit(data)
#Run the 4-Means clustering algorithm and predict
kmeans = KMeans( n_clusters = 4, init='k-means++', random_state=111)
yhat_km=kmeans.fit_predict(data)
#Map the cluster centers back from SVD space to the 13 scaled features
#and show them per segment (13 empty annotation strings per row)
fig = ff.create_annotated_heatmap(svd.inverse_transform(kmeans.cluster_centers_), x=list(category_counts.drop(columns=['customer_unique_id']).columns), y=['Segment 1','Segment 2','Segment 3','Segment 4'], annotation_text=[['']*13]*4, showscale=True)
fig.show()
#Undo the MinMax scaling as well, so the heatmap shows original-unit values
fig = px.imshow(pd.DataFrame(scaler.inverse_transform(svd.inverse_transform(kmeans.cluster_centers_)), columns=category_counts.drop(columns=['customer_unique_id']).columns), aspect="auto", text_auto=True)
fig.show()
#I would like to look at the sales on a daily level to get some insights
#(daily revenue, excluding canceled/unavailable orders)
daily=pd.DataFrame(orders_all[~orders_all['order_status'].isin(['canceled', 'unavailable'])].groupby([orders_all['order_purchase_timestamp'].dt.date])['price'].sum())
#Changing the datatype
daily=daily.reset_index()
daily=daily.astype({'order_purchase_timestamp':'datetime64[ns]'})
#Remove irrelevant, irregular data from the past
daily=daily[daily['order_purchase_timestamp']>=pd.Timestamp('2017-01-05')]
#Plotting with a LOWESS trendline (frac controls the smoothing window)
fig = px.scatter(daily, x='order_purchase_timestamp', y='price', trendline="lowess", trendline_options=dict(frac=0.07), trendline_color_override="darkblue")
fig.update_traces(mode = 'lines')
fig.update_layout(
title={
'text': "Daily sales revenue",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
xaxis_title="Sales Date",
yaxis_title="Revenue",
)
#Annotate the two visible seasonal spikes
fig.add_annotation(x='2017-10-20', y=90000,
text="Black Friday",
showarrow=False,
font=dict(
size=10,
))
fig.add_annotation(x='2017-11-30', y=10000,
text="Christmas",
showarrow=False,
font=dict(
size=10,
))
fig.show()